Contents

library(Matrix)
library(scran)
library(Rtsne)
library(irlba)
library(cowplot)

source("/nfs/research1/marioni/jonny/chimera-tal1/scripts/core_functions.R")
load_data()

nPC = 50

In this script, we perform batch correction on our data.

1 Batch correction

For batch correction, we employ the scran function fastMNN, which performs batch correction in the manner of mnnCorrect, but in the PC-space, and much faster. Critically, this is a composition-aware batch-correction strategy that should not be affected by the lack of e.g. blood cells in the knockout samples. We correct within each timepoint only.

1.1 Total correction

hvgs = getHVGs(sce)

correct = doBatchCorrect(counts = logcounts(sce[hvgs,]),
                         timepoints = as.character(meta$tomato), #first correct genotypes separately
                         samples = meta$sample,
                         timepoint_order = c("FALSE", "TRUE"), #host cells first
                         sample_order = 1:4 #doesn't matter as pairwise correction
                         )


corrected = list(all = correct[match(meta$cell, rownames(correct)),])
base = prcomp_irlba(t(logcounts(scater::normalize(sce[hvgs, ]))), n = nPC)$x

saveRDS(corrected, file = "/nfs/research1/marioni/jonny/chimera-tal1/data/corrected_pcas.rds")
saveRDS(base, file = "/nfs/research1/marioni/jonny/chimera-tal1/scripts/batch_correct/base_pca.rds")

A t-SNE visualisation of our cells, pre- and post-correction, is shown in Figure 1.

tsne_pre = Rtsne(base, pca = FALSE)$Y
tsne_post = Rtsne(corrected$all, pca = FALSE)$Y

ro = sample(nrow(base), nrow(base))




p1 = ggplot(as.data.frame(tsne_pre)[ro,], aes(x = V1, y = V2, col = factor(meta$sample)[ro])) +
  geom_point(size = 0.4) +
  scale_colour_manual(values = c("3" = "black", "1" = "red", "2" = "coral", "4" = "darkgrey")) +
  theme(legend.position = "none",
        axis.line = element_blank(),
        axis.text = element_blank(),
        axis.ticks = element_blank(),
        axis.title = element_blank()) +
  ggtitle("Pre-correction")

p2 = ggplot(as.data.frame(tsne_post)[ro,], aes(x = V1, y = V2, col = factor(meta$sample)[ro])) +
  geom_point(size = 0.4) +
  scale_colour_manual(values = c("3" = "black", "1" = "red", "2" = "coral", "4" = "darkgrey")) +
  theme(legend.position = "none",
        axis.line = element_blank(),
        axis.text = element_blank(),
        axis.ticks = element_blank(),
        axis.title = element_blank()) +
  ggtitle("Post-correction")


plot_grid(p1, p2, nrow = 2)
t-SNE of cells before and after correction. Red and coral cells are Tomato+ (injected), black and grey cells are Tomato- (embryo). Coral and grey cells mark the thrid and fourth samples in the E7.5 timepoint.

Figure 1: t-SNE of cells before and after correction
Red and coral cells are Tomato+ (injected), black and grey cells are Tomato- (embryo). Coral and grey cells mark the thrid and fourth samples in the E7.5 timepoint.

2 Celltype plots

Figure 2 shows the same plots, but coloured by the mapped celltype (see the mapping script for details). Doublets and stripped nuclei are excluded.

corrected_final = corrected$all[!meta$celltype.mapped %in% c("Stripped", "Doublet"),]
meta_final = meta[!meta$celltype.mapped %in% c("Stripped", "Doublet"),]

tsne_final = Rtsne(corrected_final, pca = FALSE)$Y

ro = sample(nrow(meta_final), nrow(meta_final))

plegend = ggplot(as.data.frame(tsne_final)[ro,], aes(x = V1, y = V2, col = factor(meta_final$celltype.mapped[ro], levels = names(celltype_colours), ordered = TRUE))) +
  geom_point(size = 0.4) +
  scale_colour_manual(values = celltype_colours, name = "") +
  theme(axis.line = element_blank(),
        axis.text = element_blank(),
        axis.ticks = element_blank(),
        axis.title = element_blank())  +
  guides(col = guide_legend(override.aes = list(size = 5), ncol = 5))

p1 = ggplot(as.data.frame(tsne_final)[ro,], aes(x = V1, y = V2, col = factor(meta_final$celltype.mapped[ro], levels = names(celltype_colours), ordered = TRUE))) +
  geom_point(size = 0.4) +
  scale_colour_manual(values = celltype_colours) +
  theme(legend.position = "none",
        axis.line = element_blank(),
        axis.text = element_blank(),
        axis.ticks = element_blank(),
        axis.title = element_blank())



plot_grid(p1, get_legend(plegend), ncol = 1)
t-SNE, coloured by celltype.

Figure 2: t-SNE, coloured by celltype

3 UMAP

Finally, we generate UMAP coordinates of the batch-corrected data. Doublets and stripped nuclei are excluded. The UMAP is shown in Figure 3.

write.table(corrected_final, file = "/nfs/research1/marioni/jonny/chimera-tal1/scripts/batch_correct/scanpy_input.tab", sep = "\t", row.names = FALSE, col.names = FALSE, quote = FALSE)

system("python3 /nfs/research1/marioni/jonny/chimera-tal1/scripts/batch_correct/umap.py")

umap = read.table("/nfs/research1/marioni/jonny/chimera-tal1/scripts/batch_correct/scanpy_output.tab", sep = "\t", header = FALSE)
p1 = ggplot(as.data.frame(umap)[ro,], aes(x = V1, y = V2, col = factor(meta_final$celltype.mapped[ro], levels = names(celltype_colours), ordered = TRUE))) +
  geom_point(size = 0.2) +
  scale_colour_manual(values = celltype_colours) +
  theme(legend.position = "none",
        axis.line = element_blank(),
        axis.text = element_blank(),
        axis.ticks = element_blank(),
        axis.title = element_blank())

plot_grid(p1, get_legend(plegend), ncol = 1)
UMAP of chimera cells.

Figure 3: UMAP of chimera cells

ggsave(p1, file = "/nfs/research1/marioni/jonny/chimera-tal1/scripts/batch_correct/umap.pdf",
       width = 5, height = 5)

The same UMAP plotted for Tomato positive and negative cells is shown in Figure 4.

p1 = ggplot(as.data.frame(umap)[meta_final$tomato,][ro,], aes(x = V1, y = V2, 
                                                              col = factor(meta_final$celltype.mapped[meta_final$tomato][ro],
                                                                           levels = names(celltype_colours), ordered = TRUE))) +
  geom_point(size = 0.2) +
  scale_colour_manual(values = celltype_colours) +
  theme(legend.position = "none",
        axis.line = element_blank(),
        axis.text = element_blank(),
        axis.ticks = element_blank(),
        axis.title = element_blank()) +
  lims(x= c(min(umap$V1), max(umap$V1)),
       y= c(min(umap$V2), max(umap$V2)))

p2 = ggplot(as.data.frame(umap)[!meta_final$tomato,][ro,], aes(x = V1, y = V2, 
                                                              col = factor(meta_final$celltype.mapped[!meta_final$tomato][ro],
                                                                           levels = names(celltype_colours), ordered = TRUE))) +
  geom_point(size = 0.2) +
  scale_colour_manual(values = celltype_colours) +
  theme(legend.position = "none",
        axis.line = element_blank(),
        axis.text = element_blank(),
        axis.ticks = element_blank(),
        axis.title = element_blank()) +
  lims(x= c(min(umap$V1), max(umap$V1)),
       y= c(min(umap$V2), max(umap$V2)))

plot_grid(p1 + ggtitle("Tom+"), p2 + ggtitle("Tom-"), ncol = 2)
UMAP plotted separately for Tomato+ or Tomato- cells

Figure 4: UMAP plotted separately for Tomato+ or Tomato- cells

ggsave(p1, file = "/nfs/research1/marioni/jonny/chimera-tal1/scripts/batch_correct/umap_tom+.pdf",
       width = 5, height = 5)

ggsave(p2, file = "/nfs/research1/marioni/jonny/chimera-tal1/scripts/batch_correct/umap_tom-.pdf",
       width = 5, height = 5)

write.table(umap, file = "/nfs/research1/marioni/jonny/chimera-tal1/scripts/batch_correct/umap.tab", sep = "\t", col.names = FALSE, row.names = FALSE, quote = FALSE)
rownames(umap) = meta_final$cell

top = which(!meta_final$celltype.mapped %in% c("Mixed mesoderm", "Notochord"))
bottom = which(meta_final$celltype.mapped %in% c("Mixed mesoderm", "Notochord"))
ro = c(top[sample(length(top), length(top))],
       bottom)

umap_manual = umap[ro,]
meta_manual = meta_final[ro,]
meta_manual$X = umap_manual[,1]
meta_manual$Y = umap_manual[,2]
 

p1 = ggplot(meta_manual[meta_manual$tomato,], aes(x = X, y = Y, col = factor(celltype.mapped, levels = names(celltype_colours), ordered = TRUE))) +
  geom_point(size = 0.2) +
  scale_colour_manual(values = celltype_colours) +
  theme(legend.position = "none",
        axis.line = element_blank(),
        axis.text = element_blank(),
        axis.ticks = element_blank(),
        axis.title = element_blank()) +
  lims(x= c(min(umap$V1), max(umap$V1)),
       y= c(min(umap$V2), max(umap$V2)))

p2 = ggplot(meta_manual[!meta_manual$tomato,], aes(x = X, y = Y, col = factor(celltype.mapped, levels = names(celltype_colours), ordered = TRUE))) +
  geom_point(size = 0.2) +
  scale_colour_manual(values = celltype_colours) +
  theme(legend.position = "none",
        axis.line = element_blank(),
        axis.text = element_blank(),
        axis.ticks = element_blank(),
        axis.title = element_blank()) +
  lims(x= c(min(umap$V1), max(umap$V1)),
       y= c(min(umap$V2), max(umap$V2)))

ggsave(p1, file = "/nfs/research1/marioni/jonny/chimera-tal1/scripts/batch_correct/umap_tom+_reordered.pdf",
       width = 5, height = 5)

ggsave(p2, file = "/nfs/research1/marioni/jonny/chimera-tal1/scripts/batch_correct/umap_tom-_reordered.pdf",
       width = 5, height = 5)

4 Session Info

sessionInfo()
## R version 3.5.0 (2018-04-23)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.1 LTS
## 
## Matrix products: default
## BLAS: /usr/local/lib/R/lib/libRblas.so
## LAPACK: /usr/local/lib/R/lib/libRlapack.so
## 
## locale:
## [1] C
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] biomaRt_2.37.3              scater_1.9.10              
##  [3] cowplot_0.9.3               ggplot2_3.0.0              
##  [5] irlba_2.3.2                 Rtsne_0.13                 
##  [7] scran_1.9.13                SingleCellExperiment_1.3.9 
##  [9] SummarizedExperiment_1.11.6 DelayedArray_0.7.21        
## [11] matrixStats_0.54.0          Biobase_2.41.2             
## [13] GenomicRanges_1.33.11       GenomeInfoDb_1.17.1        
## [15] IRanges_2.15.16             S4Vectors_0.19.19          
## [17] BiocGenerics_0.27.1         BiocParallel_1.15.8        
## [19] Matrix_1.2-14               BiocStyle_2.9.3            
## [21] rmarkdown_1.10             
## 
## loaded via a namespace (and not attached):
##  [1] bitops_1.0-6             bit64_0.9-7             
##  [3] httr_1.3.1               progress_1.2.0          
##  [5] rprojroot_1.3-2          dynamicTreeCut_1.63-1   
##  [7] tools_3.5.0              backports_1.1.2         
##  [9] R6_2.2.2                 vipor_0.4.5             
## [11] DBI_1.0.0                lazyeval_0.2.1          
## [13] colorspace_1.3-2         withr_2.1.2             
## [15] prettyunits_1.0.2        tidyselect_0.2.4        
## [17] gridExtra_2.3            bit_1.1-14              
## [19] compiler_3.5.0           labeling_0.3            
## [21] bookdown_0.7             scales_0.5.0            
## [23] stringr_1.3.1            digest_0.6.15           
## [25] XVector_0.21.3           pkgconfig_2.0.1         
## [27] htmltools_0.3.6          highr_0.7               
## [29] limma_3.37.3             rlang_0.2.1             
## [31] RSQLite_2.1.1            DelayedMatrixStats_1.3.4
## [33] bindr_0.1.1              dplyr_0.7.6             
## [35] RCurl_1.95-4.11          magrittr_1.5            
## [37] GenomeInfoDbData_1.1.0   Rcpp_0.12.18            
## [39] ggbeeswarm_0.6.0         munsell_0.5.0           
## [41] Rhdf5lib_1.3.1           viridis_0.5.1           
## [43] stringi_1.2.4            yaml_2.2.0              
## [45] edgeR_3.23.3             zlibbioc_1.27.0         
## [47] rhdf5_2.25.4             plyr_1.8.4              
## [49] grid_3.5.0               blob_1.1.1              
## [51] crayon_1.3.4             lattice_0.20-35         
## [53] hms_0.4.2                locfit_1.5-9.1          
## [55] knitr_1.20               pillar_1.3.0            
## [57] igraph_1.2.1             rjson_0.2.20            
## [59] kmknn_0.99.16            reshape2_1.4.3          
## [61] XML_3.98-1.12            glue_1.3.0              
## [63] evaluate_0.11            data.table_1.11.4       
## [65] gtable_0.2.0             purrr_0.2.5             
## [67] assertthat_0.2.0         xfun_0.3                
## [69] viridisLite_0.3.0        tibble_1.4.2            
## [71] AnnotationDbi_1.43.1     beeswarm_0.2.3          
## [73] memoise_1.1.0            tximport_1.9.8          
## [75] bindrcpp_0.2.2           statmod_1.4.30